Exploratory analysis

Author

Denis Shah

Published

March 4, 2025

Packages

library(tidyverse)
library(ggthemes)
library(grid)
library(gridExtra)
library(patchwork)

# tidyfun is currently not on CRAN. You can install the development version from GitHub with:
# # install.packages("pak")
# pak::pak("tidyfun/tidyfun")
library(tidyfun)
# my modifications to the geom-spaghetti function (updated ggplot linewidth instead of size):
source(here::here("FunctionalDataAnalysis", "FDAExploratory", "geom-spaghetti.R"))

# theme_half_open is part of the cowplot themes
# theme_set(theme_half_open(font_size = 12))
wm_load <- readr::read_csv(here::here("Data", "data_model_plus_weather_filtered.csv"), show_col_types = FALSE)
# Simplify things. Keep only the weather-related variables and others needed for calculations
wm_data <-
  wm_load %>%
  dplyr::select(subject, date, planting.date, sampling.date, wm, d2m, t2m_mean, t2m_max, t2m_min, st, sp, sm, rh) %>%
  # Do the filtering steps before doing any calculations or feature engineering:
  # wm_load has identical data for each of the sampling dates, which is why there is a filtering step on sampling.date.
  dplyr::group_by(subject) %>% 
  dplyr::filter(sampling.date == max(sampling.date)) %>%
  dplyr::ungroup() %>% 
  # Add the response variable (wm present or absent; binary):
  dplyr::group_by(subject) %>% 
  dplyr::mutate(wm = (mean(wm, na.rm = T) > 0)*1) %>% 
  # wm as a factor:
  dplyr::mutate(wm = factor(wm, levels = c(0, 1))) %>%
  dplyr::ungroup() %>% 
  dplyr::filter(!is.na(wm)) %>% 
  # Calculate dap (as a numeric):
  dplyr::mutate(dap = as.numeric(date - planting.date)) %>%
  # Convert temperatures from Kelvin to degrees Celsius:
  dplyr::mutate(across(d2m:st, ~ .x - 273.15)) %>%
  # dewpoint depression:
  dplyr::mutate(dpd = t2m_mean - d2m) %>%
  # surface pressure in kPa:
  dplyr::mutate(sp = sp/1000) %>%
  # Ratio of soil temperature to soil moisture:
  dplyr::mutate(stsm = st/sm) %>%
  # Difference in max and min temperatures:
  dplyr::mutate(tdiff = t2m_max - t2m_min) %>%
  # estimate GDD (base 0). NB: base 0 is reasonable for snap bean; see the Jenni et al. (2000) paper. I think we want GDD to start accumulating the day after planting date onwards...
  # I use the day after planting, because we don't know exactly what time of the day the field was planted.
  dplyr::mutate(gddi = ifelse(dap <= 0, 0, (t2m_max + t2m_min)*0.5 - 0)) %>%
  dplyr::group_by(subject) %>% 
  # We don't want the gddi column after creating gdd:
  dplyr::mutate(gdd = cumsum(gddi), .keep = "unused") %>%
  dplyr::ungroup() %>%
  # Keep only the columns you need:
  dplyr::select(subject, dap, wm:gdd)
# To avoid problems with domain-related calculations in fda, check on the range of dap for each subject.
wm_data %>%
  dplyr::group_by(subject) %>%
  dplyr::summarise(min_dap = min(dap), max_dap = max(dap)) %>%
  # min_dap is -30 for all subjects, but max_dap varies:
  # print(n = Inf)
  # dplyr::pull(max_dap) %>%
  # range() # 50 to 77
  ggplot(., aes(x = max_dap)) +
  geom_histogram(binwidth = 1, fill = "#69b3a2", colour = "black", alpha = 0.8) +
  theme_bw() +
  xlab("Max dap") +
  ylab("Frequency")

# The empirical cdf for max_dap:
wm_data %>%
  dplyr::group_by(subject) %>%
  dplyr::summarise(min_dap = min(dap), max_dap = max(dap)) %>%
  ggplot(., aes(max_dap)) + 
  stat_ecdf(geom = "step") +
  theme_bw() +
  labs(x = "Max dap", y = "ECDF")


# Create a tf object:
foo_df <-
  wm_data %>%
  dplyr::select(subject, dap, wm, d2m) %>% 
  # Not restricting the dap range (max_dap is variable by subject):
  tf_nest(d2m, .id = subject, .arg = dap)

# Plot the data for a few subjects:  
foo_df %>% 
  dplyr::filter(subject %in% 1:5) %>%
  ggplot(aes(tf = d2m, colour = factor(wm))) + 
  geom_spaghetti(alpha = 1)


# The mean for each wm group, then a lowess smooth.
# Of course, you have to choose a proper span parameter (f argument) to avoid over-smoothing. 
# tf_smooth uses f = .15 as the default instead of the typical .75
foo_df %>% 
  dplyr::group_by(wm) %>% 
  dplyr::summarize(mean_d2m = mean(d2m)) %>% 
  dplyr::mutate(smooth_mean = tf_smooth(mean_d2m, method = "lowess", f = 0.15)) %>%
  ggplot(aes(tf = smooth_mean, color = wm)) +
  geom_spaghetti(linewidth = 1.25, alpha = 1)

# Using a spline basis representation to smooth the means:
foo_df %>% 
  dplyr::group_by(wm) %>% 
  dplyr::summarize(mean_d2m = mean(d2m)) %>% 
  dplyr::mutate(smooth_mean = tfb(mean_d2m)) %>%
  ggplot(aes(tf = smooth_mean, color = wm)) +
  geom_spaghetti(linewidth = 1.25, alpha = 1) +
  scale_x_continuous(breaks = seq(-30, 70, by = 10)) +
  annotate("rect", ymin = -Inf, ymax = Inf, xmin = 35, xmax = 50, 
           fill = "steelblue", alpha = 0.2) +
  geom_vline(xintercept = 0, color = "gray", linetype = "dashed") +
  scale_color_colorblind(labels = c( "Absent","Present")) +
  theme_bw() +
  labs(x = "Days relative to sowing",
       y = "foo",
       color = "White mold") +
  theme(legend.position = "bottom")


# Next, want to plot the difference between the wm curves
x <-
  foo_df %>% 
  group_by(wm) %>% 
  summarize(mean_d2m = mean(d2m)) %>% 
  # Will use a lowess smoother here instead of spline basis:
  mutate(smooth_mean = tf_smooth(mean_d2m, method = "lowess", f = 0.15)) %>%
  # mutate(smooth_mean = tfb(mean_d2m))
  # Prepping to unnest:
  select(-mean_d2m) %>%
  tf_unnest(cols = "smooth_mean") %>%
  # These steps are to be able to get the difference between the two mean curves
  pivot_wider(names_from = wm, values_from = smooth_mean_value, names_prefix = "wm=") %>%
  mutate(diff = `wm=1` - `wm=0`)


tibble(id = 1, dap = x$smooth_mean_arg, diff = x$diff) %>%
  # Get back into tfd format:
  tf_nest(diff, .id = id, .arg = dap) %>%
  ggplot(aes(tf = diff)) +
  geom_spaghetti(linewidth = 1.25, alpha = 1, color = "black") +
  geom_hline(yintercept = 0, color = "gray", linetype = "dashed") +
  theme_bw() +
  labs(x = "Days relative to sowing",
       y = "Difference",
       title = "foo")
# GOAL: Generalize the code for plotting the mean curves and difference between the mean curves:

wm.tfd <- function(x) {
  # Create a tidy functional data object for a weather variable
  # Args:
  #  x = unquoted variable name
  # Returns:
  #  a tidy functional data object
  #
  .x = enquo(x)
  
  # Create a tf object:
  df <-
    wm_data %>%
    dplyr::select(subject, dap, wm, !!.x) %>% 
    # Not restricting the dap range (max_dap is variable by subject):
    tf_nest(!!.x, .id = subject, .arg = dap)
  
  return(df)
}


wm.curves <- function(x, .ylab = NULL, ...) {
  # Plot the smoothed mean curves for wm = 0 and wm = 1
  # Args:
  #  x = unquoted variable name
  # Returns:
  #  a ggplot of the smoothed mean curves for wm = 0 and wm = 1
  .x = enquo(x)
  
  # Create a tf object:
  suppressMessages(
    wm_data %>%
      dplyr::select(subject, dap, wm, !!.x) %>% 
      # Create a tf object:
      # Not restricting the dap range (max_dap is variable by subject):
      tf_nest(!!.x, .id = subject, .arg = dap) %>%
      dplyr::group_by(wm) %>%
      # Get the means:
      dplyr::summarize(var_mean = mean(!!.x)) %>%
      # We used suppressMessages() to suppress output generated by the tfb arg:
      # Using a spline basis representation to smooth the means:
      dplyr::mutate(smooth_mean = tfb(var_mean)) %>%
      ggplot(aes(tf = smooth_mean, color = wm)) +
      geom_spaghetti(linewidth = 1.25, alpha = 1) +
      scale_x_continuous(breaks = seq(-30, 70, by = 10)) +
      annotate("rect", ymin = -Inf, ymax = Inf, xmin = 35, xmax = 50,
               fill = "steelblue", alpha = 0.2) +
      geom_vline(xintercept = 0, color = "gray", linetype = "dashed") +
      ggthemes::scale_color_colorblind(labels = c( "Absent","Present")) +
      theme_bw() +
      labs(x = "Days relative to planting",
           y = .ylab,
           color = "White mold") +
      theme(axis.title.y = element_text(size = 8)) +
      theme(
        axis.text.x = element_text(size = 7),  # Decrease x-axis tick label size
        axis.text.y = element_text(size = 7)  # Decrease y-axis tick label size
        ) +
      theme(legend.position = "bottom")
    )
} # end function wm.curves


wm.diff.curve <- function(x, .span = 0.15, my.title = NULL) {
  # Plot the difference between the wm curves
  # Args:
  #  x = unquoted variable name
  #  .span = span for the lowess smoother
  # Returns:
  #  a ggplot of the smoothed difference between the wm = 0 and wm = 1 mean curves
  
  .x = enquo(x)
  
  df <- wm.tfd(!!.x)
  
  z <-
    suppressMessages(
      df %>% 
      dplyr::group_by(wm) %>% 
      dplyr::summarize(var_mean = mean(!!.x)) %>% 
      # Will use a lowess smoother here instead of spline basis:
      dplyr::mutate(smooth_mean = tf_smooth(var_mean, method = "lowess", f = .span)) %>%
      # dplyr::mutate(smooth_mean = tfb(var_mean)) %>%
      # Prepping to unnest:
      dplyr::select(-var_mean) %>%
      tf_unnest(cols = "smooth_mean") %>%
      # These steps are to be able to get the difference between the two mean curves
      tidyr::pivot_wider(names_from = wm, values_from = smooth_mean_value, 
                         names_prefix = "wm=") %>%
      dplyr::mutate(diff = `wm=1` - `wm=0`) )
  
  tibble(id = 1, dap = z$smooth_mean_arg, diff = z$diff) %>%
    # Get back into tfd format:
    tf_nest(diff, .id = id, .arg = dap) %>%
    ggplot(aes(tf = diff)) +
    geom_spaghetti(linewidth = 1.25, alpha = 1, color = "black") +
    scale_x_continuous(breaks = seq(-30, 70, by = 10)) +
    annotate("rect", ymin = -Inf, ymax = Inf, xmin = 35, xmax = 50, 
           fill = "steelblue", alpha = 0.2) +
    geom_hline(yintercept = 0, color = "gray", linetype = "dashed") +
    geom_vline(xintercept = 0, color = "gray", linetype = "dashed") +
    theme_bw() +
    labs(x = "Days relative to sowing",
         y = "Difference",
         title = my.title)
  }  # end function wm.diff.curve


# Example of use:
# wm.diff.curve(d2m, .span = 0.15)

Environmental variables

Dew point

Means

# d2m = dewpoint temperature (2m)
wm.curves(d2m, .ylab = "Dew point (°C)")

Difference

wm.diff.curve(d2m, .span = 0.2, my.title = "Dew point")

Mean air temperature

Means

# t2m_mean = mean air temperature (2m)
wm.curves(t2m_mean, .ylab = "Mean air temperature (°C)")

Difference

wm.diff.curve(t2m_mean, .span = 0.2, my.title = "Mean air temperature")

Max air temperature

Means

# t2m_max = max air temperature (2m)
wm.curves(t2m_max, .ylab = "Max. air temperature (°C)")

Difference

wm.diff.curve(t2m_max, .span = .2, my.title = "Max. air temperature")

Min air temperature

Means

# t2m_min = min air temperature (2m)
wm.curves(t2m_min, .ylab = "Min. air temperature (°C)")

Difference

wm.diff.curve(t2m_min, my.title = "Min. air temperature")

Max - Min air temperature

Means

wm.curves(tdiff, .ylab = "Max. - Min. air temperature (°C)")

Difference

wm.diff.curve(tdiff, my.title = "Max. - Min. air temperature")

Dewpoint depression

Means

# dpd = dewpoint depression
wm.curves(dpd, .ylab = "Dew point depression (°C)")

Difference

wm.diff.curve(dpd, my.title = "Dew point depression")

Soil temperature

Means

# st = soil temperature 
wm.curves(st, .ylab = "Soil temperature (°C)")

Difference

wm.diff.curve(st, my.title = "Soil temperature")

Soil moisture

Means

# sm = soil moisture
wm.curves(sm, .ylab = "Soil Moisture (m³/m³)")

Difference

wm.diff.curve(sm, my.title = "Soil Moisture")

Ratio of soil temperature: soil moisture

Means

# stsm = Ratio of soil temperature: soil moisture
wm.curves(stsm, .ylab = "Soil temperature: Soil moisture")

Difference

wm.diff.curve(stsm, my.title = "Soil temperature: Soil moisture")

Surface pressure

Means

# sp = surface pressure
wm.curves(sp, .ylab = "Surface pressure (kPa)")

Difference

wm.diff.curve(sp, .span = .2, my.title = "Surface pressure")

Relative humidity

Means

# rh = relative humidity
wm.curves(rh, .ylab = "Relative Humidity (%)")

Difference

wm.diff.curve(rh, .span = .2, my.title = "Relative Humidity")

Growing degree days

Means

# gdd = growing degree days
wm.curves(gdd, .ylab = "Growing degree days")

Difference

wm.diff.curve(gdd, .span= 0.3, my.title = "Growing degree days")

Publication figure

# This is a replacement for Fig. 3 in the PNAS paper
p1 <- wm.curves(d2m, .ylab = "Dewpoint (°C)")
p2 <- wm.curves(t2m_mean, .ylab = "Air temperature (°C)")
p3 <- wm.curves(dpd, .ylab = "Dewpoint depression (°C)")
p4 <- wm.curves(st, .ylab = "Soil temperature (°C)")
p5 <- wm.curves(sm, .ylab = "Soil moisture (m³/m³)")
p6 <- wm.curves(stsm, .ylab = "Soil temperature:moisture ratio")
p7 <- wm.curves(sp, .ylab = "Surface pressure (kPa)")
p8 <- wm.curves(rh, .ylab = "Relative humidity (%)")
  
  
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 +
  plot_annotation(tag_levels = 'A') +
  plot_layout(nrow = 4, byrow = T) + 
  plot_layout(guides = 'collect', axis_titles = "collect") &
  cowplot::theme_half_open(font_size = 12)&
  theme(legend.position='bottom',
        axis.title = element_text(size = 10))

ggsave("figs/functional_curves.png", dpi = 600, height = 11, width =8, bg = "white")

Session Info

sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default


locale:
[1] LC_COLLATE=Portuguese_Brazil.utf8  LC_CTYPE=Portuguese_Brazil.utf8   
[3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C                      
[5] LC_TIME=Portuguese_Brazil.utf8    

time zone: America/Sao_Paulo
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices datasets  utils     methods  
[8] base     

other attached packages:
 [1] tidyfun_0.0.98  tf_0.3.4        patchwork_1.3.0 gridExtra_2.3  
 [5] ggthemes_5.1.0  lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1  
 [9] dplyr_1.1.4     purrr_1.0.2     readr_2.1.5     tidyr_1.3.1    
[13] tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0 knitr_1.48     

loaded via a namespace (and not attached):
 [1] gtable_0.3.5       xfun_0.48          htmlwidgets_1.6.4  GGally_2.2.1      
 [5] lattice_0.22-6     tzdb_0.4.0         vctrs_0.6.5        tools_4.4.1       
 [9] generics_0.1.3     parallel_4.4.1     fansi_1.0.6        pkgconfig_2.0.3   
[13] Matrix_1.7-0       checkmate_2.3.2    RColorBrewer_1.1-3 lifecycle_1.0.4   
[17] compiler_4.4.1     farver_2.1.2       textshaping_0.4.0  munsell_0.5.1     
[21] htmltools_0.5.8.1  yaml_2.3.10        pracma_2.4.4       crayon_1.5.3      
[25] pillar_1.9.0       nlme_3.1-164       ggstats_0.7.0      tidyselect_1.2.1  
[29] digest_0.6.37      mvtnorm_1.3-2      stringi_1.8.4      labeling_0.4.3    
[33] splines_4.4.1      cowplot_1.1.3      rprojroot_2.0.4    fastmap_1.2.0     
[37] here_1.0.1         colorspace_2.1-1   cli_3.6.3          magrittr_2.0.3    
[41] utf8_1.2.4         withr_3.0.2        scales_1.3.0       backports_1.5.0   
[45] bit64_4.5.2        timechange_0.3.0   rmarkdown_2.28     bit_4.5.0         
[49] ragg_1.3.3         zoo_1.8-12         hms_1.1.3          evaluate_1.0.1    
[53] mgcv_1.9-1         rlang_1.1.4        Rcpp_1.0.13        glue_1.8.0        
[57] renv_1.1.2         vroom_1.6.5        rstudioapi_0.17.0  jsonlite_1.8.9    
[61] R6_2.5.1           plyr_1.8.9         systemfonts_1.1.0